*** Start from a clean session: no paging prompts, no open log, nothing left in memory
set more off
capture log close		// "capture" suppresses the error raised when no log is open
clear all
program drop _all
snapshot erase _all

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** File name: 		Calculate Pareto-Adjusted P's and L's.do
*** Last updated: 	2/10/2016
***
*** This file reads in the cleaned p's and l's data, then
***		1. Calculates the pareto parameter alpha that will characterize the top of the adjusted income distribution
***		2. Calculates new pareto-adjusted p's and l's based on the calculated value of alpha
***		3. Saves the data:
***		   Output Data/Raw and Pareto-Adjusted P's and L's.dta
********************************************************************************************
********************************************************************************************
********************************************************************************************

*** Work from the project directory held in the global macro "directory".
*** That global is not set anywhere above in this file, so it must be defined before this file is run.
cd "$directory"

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 1. Calculate the pareto parameter, alpha, that will characterize the top of the adjusted income distribution
*** 	a. Read in the data
***		b. Calculate the minimum and maximum incomes recorded in the top decile of the survey
***		c. Calculate share of total income in the adjusted income distribution that was captured by the survey
***		d. Calculate alpha such that the entire population captured by the survey (that is, all those with incomes at or below the highest incomes recorded in the survey)
*** 	   owns a share of total income equal to the ratio of survey mean to national accounts mean 
***		e. Using these calculated alpha values, calculate what percent of the Pareto-adjusted distribution falls at or below the highest incomes recorded in the survey
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Read in the data
*****************************************************************
*****************************************************************

*** The path is relative to the directory set by the cd command above; "clear" discards any data in memory
use "Output Data/Cleaned P's and L's.dta", clear

*****************************************************************
*****************************************************************
*** b. Calculate the minimum and maximum incomes recorded in the top decile of the survey
*****************************************************************
*****************************************************************

**************************************
*** Flag the observations that make up the top survey decile
**************************************

*** One survey = one countryname-year-surveytype-urbrur group
local survey_id countryname year surveytype urbrur

*** Distance of every p value from .9, and the smallest such distance within each survey
generate diff = abs(p - .9)
bysort `survey_id': egen min_diff = min(diff)

*** The p value closest to .9 is the lower bound of the "top decile".
*** This value is ~0.9, but it is not exact because our data does not capture precise percentiles.
*** If two p values are equally close to .9, the larger one is used.
bysort `survey_id': egen top_decile_p = max(cond(diff == min_diff, p, .))

*** Indicator for all p values at or above that lower bound
generate top_decile = (p >= top_decile_p)

drop diff min_diff

**************************************
*** Calculate mean income for each percentile group
**************************************

*** Get the non-cumulative percent of population and percent of income for each percentile group in the top decile.
*** Each difference is taken against the previous observation (by p) of the same survey;
*** the first observation of a survey has no predecessor, so its differences are missing.
sort countryname year surveytype urbrur p
by countryname year surveytype urbrur: gen p_diff = p - p[_n-1] if top_decile == 1
by countryname year surveytype urbrur: gen l_diff = l - l[_n-1] if top_decile == 1

*** Calculate mean income for each percentile group:
***		mean income = (group's total income) / (group's population)
***		            = (survey_mean*pop*l_diff) / (pop*p_diff)
***		            = survey_mean*l_diff / p_diff
*** The population term cancels, so it is left out of the calculation: multiplying and dividing
*** by pop would only turn mean_y into missing wherever pop is missing or zero.
gen mean_y = survey_mean*l_diff/p_diff

**************************************
*** Get the mean income of the lowest and highest percentile for the top decile of each survey -- we will use these values as the minimum and maximum incomes recorded in the survey
*** Note: mean_y is missing outside the top decile, and egen min()/max() ignore missing values
**************************************
bys countryname year urbrur surveytype: egen min_y = min(mean_y)
bys countryname year urbrur surveytype: egen max_y = max(mean_y)

*****************************************************************
*****************************************************************
*** c. Calculate share of total income in the adjusted income distribution that was captured by the survey
*****************************************************************
*****************************************************************

*** Both ratios below are only defined where national accounts data exist and exceed the survey mean
local has_gap na_mean > survey_mean & !missing(na_mean)

**************************************
*** Ratio of the survey mean to the total national income captured by the adjusted distribution,
*** assuming that HALF of the survey-national accounts gap is due to missing top incomes:
***		survey_mean / (survey_mean + .5*(na_mean - survey_mean)) = 2*survey_mean / (survey_mean + na_mean)
**************************************

*** Note: ratio will be missing if the survey mean is greater than the national accounts mean, or if we are missing national accounts data
generate ratio = 2*survey_mean/(na_mean + survey_mean) if `has_gap'
label variable ratio "Share of income in adjusted distribution captured by survey"

**************************************
*** Ratio of the top-decile income captured by the survey to that amount plus half the survey-national accounts gap
*** (same assumption: HALF of the gap is due to missing top incomes)
**************************************

*** Share of survey income belonging to the population below the top decile: the l value at the top-decile cutoff
generate top_decile_l_temp = l if p == top_decile_p
bysort countryname year urbrur surveytype: egen top_decile_l = max(top_decile_l_temp)

*** Per-capita income of the survey's top decile, written once and reused in numerator and denominator
local top_income survey_mean*(1 - top_decile_l)

generate ratio2 = `top_income' / (`top_income' + (na_mean - survey_mean)/2) if `has_gap'

*****************************************************************
*****************************************************************
*** d. Calculate alpha such that the entire population captured by the survey (that is, all those with incomes at or below the highest incomes recorded in the survey)
***    owns a share of total income equal to the ratio of survey mean to national accounts mean 
*****************************************************************
*****************************************************************

**************************************
*** Calculate alpha
*** Note: Plugging the Pareto CDF into the Pareto Lorenz curve gives the income share as a function of income; solving for alpha:
***		alpha = log(1-L)/log(y_min/y) + 1
*** evaluated at
***		L     = ratio2 (top-decile income captured by the survey, relative to that amount plus half the survey-national accounts gap)
***		y_min = min_y  (minimum income in top decile of survey data)
***		y     = max_y  (top income recorded in survey data)
**************************************
generate alpha = ln(1 - ratio2)/ln(min_y/max_y) + 1
label variable alpha "Pareto Parameter (alpha)"

*** Sanity checks.
*** A missing value compares as larger than any number, so the first check is passed by observations with no alpha;
*** the second check requires alpha to exist wherever the adjustment applies.
assert alpha > 1
assert !missing(alpha) if na_mean > survey_mean & !missing(na_mean)

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 2. Calculate new pareto-adjusted p's and l's based on the calculated value of alpha
*** 	a. Using the calculated alpha values, calculate what percent of the top section of the income distribution falls at or below the highest incomes recorded in the survey
***		b. Impute Pareto values for the missing top incomes in the top section of the national distribution 
***		c. Rescale the original p and l values so that they refer to the bottom of the total income distribution
***		d. Rescale the Pareto-imputed p and l values for top income individuals so that they refer to the top of the total income distribution
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Using the calculated alpha values, calculate the top survey decile as a share of the top decile + new top incomes
*****************************************************************
*****************************************************************

**************************************
*** Use the Pareto CDF formula to calculate this value:
***		share of the top section with income at or below max_y = 1 - (min_y/max_y)^alpha
**************************************
gen survey_pct_top = 1 - (min_y/max_y)^alpha

*****************************************************************
*****************************************************************
***	b. Impute Pareto values for the missing top incomes in the top section of the national distribution 
*****************************************************************
*****************************************************************

**************************************
*** Create new observations for our new top income values, one for each centile of missing top incomes in the top section of the national distribution,
*** plus a final observation at p = 1
**************************************

*** Create the correct number of observations by copying the last (highest p) observation of each survey.
*** num_new_obs = floor(100*(1-survey_pct_top)) + 1 is the number of equal steps the missing range (survey_pct_top, 1] is cut into.
*** One new observation is needed per step, and expand n yields n-1 copies, hence "num_new_obs + 1".
*** Creating the p = 1 observation in the same expand guarantees it exists even when survey_pct_top > .99
*** (floor(...) == 0); otherwise such a survey would get no new observations at all and its adjusted curve would stop short of 1.
*** Observations with a missing num_new_obs (all but the last of each survey, and surveys with no alpha) are not duplicated.
sort countryname year urbrur surveytype p
by countryname year urbrur surveytype: gen num_new_obs = floor(100*(1-survey_pct_top)) + 1 if _n == _N
expand (num_new_obs + 1), gen(new)

*** Assign new p values for these observations: step j of num_new_obs between survey_pct_top and 1.
*** The new observations are copies of the last original one, so they share its p and sort directly after it (new == 1).
sort countryname year urbrur surveytype p new
by countryname year urbrur surveytype: gen p_pareto = survey_pct_top + (1-survey_pct_top)*(sum(new)/num_new_obs) if new == 1

*** Set the final new observation to exactly p = 1 (its computed value can differ from 1 due to rounding)
by countryname year urbrur surveytype: replace p_pareto = 1 if _n == _N & new == 1

drop num_new_obs

*** Blank out the original p and l values for our newly created observations
replace p = . if new == 1
replace l = . if new == 1

**************************************
*** Create Pareto-imputed L values that correspond to these new p values, using the Pareto Lorenz curve
***		L(p) = 1 - (1-p)^(1-1/alpha)
*** Note: These p and L values represent shares of the top section of the income distribution, not shares of the total income distribution
**************************************
gen l_pareto = 1 - (1-p_pareto)^(1-1/alpha) if new == 1

*****************************************************************
*****************************************************************
*** c. Rescale the original p and l values so that they refer to the bottom of the total income distribution
*****************************************************************
*****************************************************************

*** Original (non-imputed) observations are adjusted only where national accounts exceed the survey mean and alpha exists
local adjustable na_mean > survey_mean & !missing(na_mean) & !missing(alpha) & new == 0

**************************************
*** Scale down original p values based on the percent of the total Pareto-adjusted distribution that was captured by the survey
**************************************

*** Calculate what share of the total income distribution was captured by the survey
*** Note: The variable survey_pct_top is the share of the top section of the total income distribution that was captured by the survey, not the share of the total income distribution
*** NOTE(review): the missing population is counted as (1-top_decile_p)*(1-survey_pct_top) survey-population units,
***               i.e. without dividing by survey_pct_top -- confirm this normalization is the intended one
generate survey_pct = 1/(1 + (1-survey_pct_top)*(1-top_decile_p))
label variable survey_pct "% of Pareto-Adjusted Population Captured by Survey"

*** Scale down the original p values based on this percent
generate p_adj = survey_pct*p if `adjustable'
label variable p_adj "Percent of Population, Adjusted for Missing Top Incomes"

**************************************
*** Scale down original l values based on what percent of total national income (as recorded in national accounts) was captured by the survey
**************************************
generate l_adj = ratio*l if `adjustable'
label variable l_adj "Percent of Income/Consumption, Adjusted for Missing Top Incomes"


*****************************************************************
*****************************************************************
*** d. Rescale the Pareto-imputed p and l values for top income individuals so that they refer to the top of the total income distribution
*****************************************************************
*****************************************************************

**************************************
*** Scale down the p values
*** (at p_pareto == survey_pct_top this equals survey_pct, and at p_pareto == 1 it equals 1)
**************************************
replace p_adj = p_pareto * (1-top_decile_p)*survey_pct ///
	+ (1-survey_pct_top*(1-top_decile_p))*survey_pct ///
	if new == 1

**************************************
*** Scale down the l values based on the amount of income belonging to the top survey decile and the size of the survey-national accounts gap
*** (ratio*top_decile_l is the adjusted income share below the top decile; l_pareto spreads the remainder)
**************************************
replace l_adj = ratio*top_decile_l + l_pareto * (1-ratio*top_decile_l) ///
	if new == 1

**************************************
*** Drop unneeded variables
**************************************

*** Intermediate variables from the alpha calculation
drop top_decile* p_diff l_diff mean_y min_y max_y ratio2

*** Intermediate variables from the Pareto imputation
drop survey_pct_top new p_pareto l_pareto

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 3. Save the data
***    The path is relative to the current working directory; "replace" overwrites an existing file of the same name
********************************************************************************************
********************************************************************************************
********************************************************************************************
save "Output Data/Raw and Pareto-Adjusted P's and L's.dta", replace
